library(naniar)
library(broom)
library(ggmap)
library(knitr)
library(lubridate)
library(timeDate)
library(tsibble)
library(here)
library(readr)
library(tidyverse)
library(ggResidpanel)
library(gridExtra)
library(kableExtra)
# Load the raw assignment data; the path is resolved against the working
# directory.
tree_data0 <- read_csv(file = "Data/Assignment_data.csv")
Part I
Question 1: Rename the variables Date Planted and Year Planted to Dateplanted and Yearplanted using the rename() function. Make sure Dateplanted is defined as a date variable. Then extract from the variable Dateplanted the year and store it in a new variable called Year. Display the first 6 rows of the data frame. (5pts)
# Give the two planting columns space-free names, parse the planting date
# (day-month-year text) into a Date, and derive the planting year from it.
tree_data <- tree_data0 %>%
  as_tibble() %>%
  rename(
    Dateplanted = `Date Planted`,
    Yearplanted = `Year Planted`
  ) %>%
  mutate(
    Dateplanted = dmy(Dateplanted),
    # Uses the Dateplanted value parsed on the line above.
    Year = year(Dateplanted)
  )
# Show the first six rows of every column; "scale_down" shrinks the wide table
# to fit the page in LaTeX output.
kable(head(tree_data, n = 6), caption = "Tree Data") %>%
  kable_styling(latex_options = c("scale_down", "hold_position"))
Tree Data
|
CoM ID
|
Common Name
|
Scientific Name
|
Genus
|
Family
|
Diameter Breast Height
|
Yearplanted
|
Dateplanted
|
Age Description
|
Useful Life Expectency
|
Useful Life Expectency Value
|
Precinct
|
Located in
|
UploadDate
|
CoordinateLocation
|
Latitude
|
Longitude
|
Easting
|
Northing
|
Year
|
|
1057605
|
White Poplar
|
Populus alba
|
Populus
|
Salicaceae
|
NA
|
1900
|
2000-01-01
|
NA
|
NA
|
NA
|
NA
|
Park
|
30/9/20
|
(-37.790893782047654, 144.92257141425543)
|
-37.79089
|
144.9226
|
317080.6
|
5815353
|
2000
|
|
1028440
|
London Plane
|
Platanus x acerifolia
|
Platanus
|
Platanaceae
|
62
|
1900
|
2000-01-02
|
Mature
|
6-10 years (>50% canopy)
|
10
|
NA
|
Park
|
30/9/20
|
(-37.84544751087332, 144.97904358884796)
|
-37.84545
|
144.9790
|
322184.5
|
5809408
|
2000
|
|
1058665
|
Small-leaved Linden
|
Tilia cordata
|
Tilia
|
Malvaceae
|
19
|
2000
|
2000-05-29
|
Semi-Mature
|
31-60 years
|
60
|
NA
|
Street
|
30/9/20
|
(-37.798902715216066, 144.96852132025538)
|
-37.79890
|
144.9685
|
321146.2
|
5814553
|
2000
|
|
1026352
|
Variegated Elm
|
Ulmus minor
|
Ulmus
|
Ulmaceae
|
26
|
1900
|
2000-01-02
|
Semi-Mature
|
21-30 years
|
30
|
NA
|
Street
|
30/9/20
|
(-37.81160317631319, 144.98355174888286)
|
-37.81160
|
144.9836
|
322500.1
|
5813172
|
2000
|
|
1038440
|
Canary Island Pine
|
Pinus canariensis
|
Pinus
|
Pinaceae
|
91
|
1900
|
2000-01-02
|
Mature
|
31-60 years
|
60
|
NA
|
Park
|
30/9/20
|
(-37.78240938533976, 144.96019784543748)
|
-37.78241
|
144.9602
|
320373.4
|
5816367
|
2000
|
|
1015128
|
London Plane
|
Platanus x acerifolia
|
Platanus
|
Platanaceae
|
99
|
1900
|
2000-01-02
|
Mature
|
11-20 years
|
20
|
NA
|
Street
|
30/9/20
|
(-37.79941634390208, 144.94984511347445)
|
-37.79942
|
144.9498
|
319503.0
|
5814460
|
2000
|
# Same preview restricted to columns 1-7 so the table stays readable.
tree_data %>%
  select(1:7) %>%
  head(n = 6) %>%
  kable(caption = "Tree Data") %>%
  kable_styling(latex_options = "hold_position")
Tree Data
|
CoM ID
|
Common Name
|
Scientific Name
|
Genus
|
Family
|
Diameter Breast Height
|
Yearplanted
|
|
1057605
|
White Poplar
|
Populus alba
|
Populus
|
Salicaceae
|
NA
|
1900
|
|
1028440
|
London Plane
|
Platanus x acerifolia
|
Platanus
|
Platanaceae
|
62
|
1900
|
|
1058665
|
Small-leaved Linden
|
Tilia cordata
|
Tilia
|
Malvaceae
|
19
|
2000
|
|
1026352
|
Variegated Elm
|
Ulmus minor
|
Ulmus
|
Ulmaceae
|
26
|
1900
|
|
1038440
|
Canary Island Pine
|
Pinus canariensis
|
Pinus
|
Pinaceae
|
91
|
1900
|
|
1015128
|
London Plane
|
Platanus x acerifolia
|
Platanus
|
Platanaceae
|
99
|
1900
|
# Same preview restricted to columns 8-13.
tree_data %>%
  select(8:13) %>%
  head(n = 6) %>%
  kable(caption = "Tree Data") %>%
  kable_styling(latex_options = "hold_position")
Tree Data
|
Dateplanted
|
Age Description
|
Useful Life Expectency
|
Useful Life Expectency Value
|
Precinct
|
Located in
|
|
2000-01-01
|
NA
|
NA
|
NA
|
NA
|
Park
|
|
2000-01-02
|
Mature
|
6-10 years (>50% canopy)
|
10
|
NA
|
Park
|
|
2000-05-29
|
Semi-Mature
|
31-60 years
|
60
|
NA
|
Street
|
|
2000-01-02
|
Semi-Mature
|
21-30 years
|
30
|
NA
|
Street
|
|
2000-01-02
|
Mature
|
31-60 years
|
60
|
NA
|
Park
|
|
2000-01-02
|
Mature
|
11-20 years
|
20
|
NA
|
Street
|
# Same preview restricted to columns 14-20.
tree_data %>%
  select(14:20) %>%
  head(n = 6) %>%
  kable(caption = "Tree Data") %>%
  kable_styling(latex_options = "hold_position")
Tree Data
|
UploadDate
|
CoordinateLocation
|
Latitude
|
Longitude
|
Easting
|
Northing
|
Year
|
|
30/9/20
|
(-37.790893782047654, 144.92257141425543)
|
-37.79089
|
144.9226
|
317080.6
|
5815353
|
2000
|
|
30/9/20
|
(-37.84544751087332, 144.97904358884796)
|
-37.84545
|
144.9790
|
322184.5
|
5809408
|
2000
|
|
30/9/20
|
(-37.798902715216066, 144.96852132025538)
|
-37.79890
|
144.9685
|
321146.2
|
5814553
|
2000
|
|
30/9/20
|
(-37.81160317631319, 144.98355174888286)
|
-37.81160
|
144.9836
|
322500.1
|
5813172
|
2000
|
|
30/9/20
|
(-37.78240938533976, 144.96019784543748)
|
-37.78241
|
144.9602
|
320373.4
|
5816367
|
2000
|
|
30/9/20
|
(-37.79941634390208, 144.94984511347445)
|
-37.79942
|
144.9498
|
319503.0
|
5814460
|
2000
|
Question 2: Have you noticed any differences between the variables Year and Yearplanted? Why is that? Demonstrate your claims using R code. Fix the problem if there is one (Hint: Use ifelse inside a mutate function to fix the problem and store the data in tree_data_clean). After this question, please use the data in tree_data_clean to proceed. (3pts)
# The preview tables show rows whose Yearplanted is 1900 while Dateplanted
# falls in 2000, so the year inside Dateplanted is rebuilt from Yearplanted and
# Year is kept in step with it.
tree_data_clean <- tree_data %>%
  mutate(
    # Swap only the year component of the ISO date text; month and day are
    # left untouched.
    Dateplanted = str_replace(
      as.character(Dateplanted),
      "2000",
      as.character(Yearplanted)
    ),
    # Assign the parsed value back to Dateplanted. An unnamed
    # `mutate(ymd(Dateplanted))` would append a stray column literally named
    # `ymd(Dateplanted)` and leave Dateplanted as text.
    Dateplanted = ymd(Dateplanted),
    Year = Yearplanted
  )
Question 3: Investigate graphically the missing values in the variable Dateplanted for the last 1000 rows of the data set. What do you observe? (max 30 words) (2pts)
There are no missing values in the last 1000 observations of the variable Dateplanted.
# Isolate Dateplanted for the final 1000 rows and plot its missingness.
tree_data_singlevariable <- select(tail(tree_data_clean, n = 1000), Dateplanted)

visdat::vis_miss(tree_data_singlevariable)

Question 4: What is the proportion of missing values in each variable in the tree data set? Display the results in descending order of the proportion. (2pts)
The missingness in the variables of the tree data set is listed below in descending order of proportion.
# Per-variable count and percentage of missing values, largest share first.
tree_data_clean %>%
  miss_var_summary() %>%
  arrange(desc(pct_miss))
## # A tibble: 21 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 Precinct 6828 100
## 2 Diameter Breast Height 1454 21.3
## 3 Age Description 1454 21.3
## 4 Useful Life Expectency 1454 21.3
## 5 Useful Life Expectency Value 1454 21.3
## 6 Dateplanted 2 0.0293
## 7 ymd(Dateplanted) 2 0.0293
## 8 Common Name 1 0.0146
## 9 Located in 1 0.0146
## 10 CoM ID 0 0
## # … with 11 more rows
Question 6: Create a map with the tree locations in the data set. (2pts)
# The base map was prepared in advance and is loaded from disk.
melb_map <- read_rds(here::here("Data/melb-map.rds"))

# Overlay every tree on the base map as a small, semi-transparent dark-green
# point.
ggmap(melb_map) +
  geom_point(
    data = tree_data_clean1,
    mapping = aes(x = Longitude, y = Latitude),
    colour = "#006400",
    alpha = 0.6,
    size = 0.2
  )

Question 7: Create another map and draw trees in the Genus groups of Eucalyptus, Macadamia, Prunus, Acacia, and Quercus. Use the “Dark2” color palette and display the legend at the bottom of the plot. (8pts)
# Keep only the trees whose genus is one of the five requested groups.
selected_group <- tree_data_clean1 %>%
  filter(
    Genus %in% c("Eucalyptus", "Macadamia", "Prunus", "Acacia", "Quercus")
  )

# Colour has to be mapped inside aes() for scale_color_brewer() and the legend
# to take effect; a constant colour outside aes() paints every point the same
# and produces no legend.
ggmap(melb_map) +
  geom_point(
    data = selected_group,
    aes(x = Longitude, y = Latitude, colour = Genus),
    alpha = 0.6,
    size = 0.2
  ) +
  scale_color_brewer(palette = "Dark2") +
  # Enlarge the legend keys only; the points on the map keep their size/alpha.
  guides(colour = guide_legend(override.aes = list(size = 2, alpha = 1))) +
  theme(legend.position = "bottom") +
  labs(title = "Map of trees belonging to the selected genus group")

Question 8: Filter the data tree_data_clean1 so that only the variables Year, Located in, and Common Name are displayed. Arrange the data set by Year in descending order and display the first 4 lines. Call this new data set tree_data_clean_filter. Then answer the following question using inline R code: When (Year), where (Located in) and what tree (Common Name) was the first tree planted in Melbourne according to this data set? (8pts)
# Reduce the data to year, location type and common name, latest year first.
tree_data_clean_filter <- arrange(
  select(tree_data_clean1, Year, `Located in`, `Common Name`),
  desc(Year)
)

# Tabulate the first four rows of the reduced data.
tree_data_clean_filter %>%
  head(n = 4) %>%
  kable(caption = "Selected Variables of Tree Data") %>%
  kable_styling(latex_options = "hold_position")
Selected Variables of Tree Data
|
Year
|
Located in
|
Common Name
|
|
2000
|
Street
|
Small-leaved Linden
|
|
2000
|
Street
|
Spotted Gum
|
|
2000
|
Street
|
Drooping sheoak
|
|
2000
|
Park
|
Kanooka
|
# Replace the two space-containing column names with syntactic ones.
tree_data_clean_filter_rename <- rename(
  tree_data_clean_filter,
  location = `Located in`,
  common_name = `Common Name`
)
The first tree was planted in 1900 at a Street and the tree name is
Question 9: How many trees were planted in parks and how many in streets? Tabulate the results (only for locations in parks and streets) using the function kable() from the kableExtra R package. (3pts)
# Count trees per location type, restricted to parks and streets.
tree_data_clean1 %>%
  filter(`Located in` %in% c("Park", "Street")) %>%
  count(`Located in`, name = "number of trees") %>%
  kableExtra::kable(caption = "Tree Count by Location") %>%
  kable_styling(latex_options = "hold_position")
Tree Count by Location
|
Located in
|
number of trees
|
|
Park
|
2737
|
|
Street
|
4088
|
Question 10: How many trees are there in each of the Family groups in the data set tree_data_clean1 (display the first 5 lines of the results in descending order)? (2pt)
# Count trees per family and show the five largest families.
tree_data_clean1 %>%
  count(Family, name = "number of trees") %>%
  arrange(desc(`number of trees`)) %>%
  head(n = 5) %>%
  kable(caption = "Tree Count by Family") %>%
  kable_styling(latex_options = "hold_position")
Tree Count by Family
|
Family
|
number of trees
|
|
Myrtaceae
|
2102
|
|
Platanaceae
|
1512
|
|
Ulmaceae
|
1125
|
|
Fabaceae
|
327
|
|
Fagaceae
|
254
|
Question 11: Create a markdown table displaying the number of trees planted in each year (use variable Yearplanted) with common names Ironbark, Olive, Plum, Oak, and Elm (Hint: Use kable() from the gridExtra R package). What is the oldest most abundant tree in this group? (8pts)
Elm is the oldest most abundant tree in this group.
# Count trees per planting year and common name for the five requested names
# (exact matches on `Common Name`).
tree_data_clean1 %>%
  filter(`Common Name` %in% c("Ironbark", "Olive", "Plum", "Oak", "Elm")) %>%
  # count() returns an ungrouped tibble, whereas group_by() + summarise()
  # leaves the result grouped by Yearplanted and emits a regrouping message.
  count(Yearplanted, `Common Name`, name = "number of trees") %>%
  # Oldest year first; within a year, most abundant name first.
  arrange(Yearplanted, desc(`number of trees`)) %>%
  kableExtra::kable(caption = "Tree Count by Year") %>%
  kable_styling(latex_options = "hold_position")
Tree Count by Year
|
Yearplanted
|
Common Name
|
number of trees
|
|
1900
|
Elm
|
179
|
|
1900
|
Ironbark
|
29
|
|
1900
|
Olive
|
17
|
|
1900
|
Oak
|
4
|
|
2000
|
Ironbark
|
23
|
|
2000
|
Elm
|
18
|
|
2000
|
Oak
|
9
|
Question 13: Plot the trees within the diameter range that you have selected in Question 12, which are located in parks and streets on a map using 2 different colours to differentiate their locations (streets or parks). (6pts)
# Trees with a breast-height diameter from 41 to 99 (inclusive) that stand in
# a park or a street.
large_trees_data_parks <- tree_data_clean1 %>%
  filter(
    # between() keeps every value in the closed range, including non-integer
    # diameters that `%in% 41:99` would silently drop.
    between(`Diameter Breast Height`, 41, 99),
    # The map is meant to contrast parks with streets only, so rows with any
    # other or missing location are excluded.
    `Located in` %in% c("Park", "Street")
  )
# Plot the selected trees on the base map, coloured by location type, with the
# legend underneath.
ggmap(melb_map) +
  geom_point(
    data = large_trees_data_parks,
    mapping = aes(x = Longitude, y = Latitude, colour = `Located in`),
    alpha = 0.6,
    size = 1.0
  ) +
  theme(legend.position = "bottom") +
  scale_color_brewer(palette = "Dark2") +
  labs(title = "Map of Large Trees")

Question 14: Create a time series plot (using geom_line) that displays the total number of trees planted per year in the data set tree_data_clean1 that belong to the Families: Myrtaceae, Arecaceae, and Ulmaceae. What do you observe from the plot? (6pts)
We see that the number of trees that were planted decreases from 1900 to 2000. More trees belonging to Myrtaceae were planted with one tree uniquely planted in 1977.
# Number of trees per planting year for the three requested families.
Fig_data <- tree_data_clean1 %>%
  filter(Family %in% c("Myrtaceae", "Arecaceae", "Ulmaceae")) %>%
  # count() returns an ungrouped tibble, whereas group_by() + summarise()
  # leaves the result grouped by Yearplanted and emits a regrouping message.
  count(Yearplanted, Family, name = "number of trees") %>%
  arrange(desc(`number of trees`))
# Line-and-point plot of trees planted per year, one colour per family.
Fig_data %>%
  ggplot(aes(x = Yearplanted, y = `number of trees`, colour = Family)) +
  geom_line() +
  geom_point() +
  # theme_bw() is a complete theme and resets every theme element, so it must
  # come before the legend tweak; in the opposite order the legend position
  # silently reverts to the default.
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(title = "Year Planted vs Number of Trees")

Part 2: Simulation Exercise
Question 15: Create a data frame called simulation_data that contains 2 variables with names response and covariate. Generate the variables according to the following model: \(response = 3.5 \times covariate + epsilon\) where covariate is a variable that takes values \(0, 1, 2, \ldots, 100\) and \(\epsilon\) is generated according to a Normal distribution (Hint: Use the function rnorm() to generate \(epsilon\).) (3pts)
# Fix the RNG seed so the simulated noise is reproducible.
set.seed(2021)

# covariate runs over 0, 1, ..., 100 (101 values); response adds standard
# normal noise to 3.5 * covariate.
simulation_data <- tibble(
  covariate = 0:100,
  response = 3.5 * covariate + rnorm(n = 101, mean = 0, sd = 1)
)
Question 16: Display graphically the relationship between the variables response and covariate (1pt) using a point plot. Which kind of relationship do you observe? (2pts)
We observe a linear relationship where the response variable increases with the covariate.
# Scatter plot of response against covariate, drawn with red points.
ggplot(data = simulation_data) +
  geom_point(aes(x = covariate, y = response), colour = "red") +
  theme_bw() +
  labs(title = "Covariate vs Response")

Question 17: Fit a linear model between the variables response and covariate that you generate in Question 15 and display the model summary. (2pts)
# Ordinary least-squares fit of response on covariate, followed by its summary.
simulation_data_lm <- lm(
  formula = response ~ covariate,
  data = simulation_data
)
summary(simulation_data_lm)
##
## Call:
## lm(formula = response ~ covariate, data = simulation_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.07431 -0.71466 0.05844 0.64196 2.25176
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.135896 0.199948 0.68 0.498
## covariate 3.493775 0.003455 1011.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.012 on 99 degrees of freedom
## Multiple R-squared: 0.9999, Adjusted R-squared: 0.9999
## F-statistic: 1.023e+06 on 1 and 99 DF, p-value: < 2.2e-16
Question 18: What are the values for the intercept and the slope in the estimated model in Question 17 (Hint: Use the function coef())? How do these values compare with the values in the simulation model? (max 50 words) (2pts)
# Alternative using base R: coef(summary(simulation_data_lm))
# Keep only the term names and their point estimates from the tidied summary.
slope_intercept <- simulation_data_lm %>%
  summary() %>%
  tidy() %>%
  select(term, estimate)
The generated model has a slope of 3.49 and an intercept of 0.14
The simulation data was generated from the equation, \(response = 3.5 \times covariate + epsilon\) where epsilon is an error factor. The generated linear model is of the form \(response = 3.4937754 \times covariate + 0.1358957\). The value 3.49 ~ 3.5 is the slope of the linear equation and the intercept of the model is 0.14. The fitted model differs from the simulation data in epsilon, which is centered around zero. The intercept of the model is close to zero.
# Alternative using base R: coef(summary(simulation_data_lm))
# Render the estimated intercept and slope as a table.
kable(slope_intercept, caption = "Slope and Intercept") %>%
  kable_styling(latex_options = "hold_position")
Slope and Intercept
|
term
|
estimate
|
|
(Intercept)
|
0.1358957
|
|
covariate
|
3.4937754
|
Question 20: Report R2, Radjusted, AIC, and BIC. Is this a good/bad model? Please explain your answer. (max 30 words) (2pts)
The model generated for the simulation data is a good model.
We know that the R-squared value is a measure of the goodness of fit of a linear model and takes values in the range [0, 1]. A good model has an R-squared close to 1. The fitted model has R-squared and adjusted R-squared values of 0.9999 and hence is a good model. When comparing models, the one with the lowest AIC and BIC is preferred. For this model, the AIC and BIC are comparable and have low values. However, we do not have any other model for comparison, and hence we judge this model to be good on the basis of the R-squared values.
# One-row model summary reduced to the four requested fit statistics.
simulation_data_lm %>%
  glance() %>%
  select(r.squared, adj.r.squared, AIC, BIC) %>%
  kable(caption = "Measures of Goodness of Fit") %>%
  kable_styling(latex_options = "hold_position")
Measures of Goodness of Fit
|
r.squared
|
adj.r.squared
|
AIC
|
BIC
|
|
0.9999032
|
0.9999022
|
293.0547
|
300.9001
|